{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
   },
   "outputs": [],
   "source": [
    "!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import random\n",
    "import pandas as pd\n",
    "import joblib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
    "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "import datatable as dt\n",
    "import lightgbm as lgb\n",
    "from matplotlib import pyplot as plt\n",
    "import riiideducation\n",
    "from sklearn.metrics import roc_auc_score\n",
    "import gc\n",
    "\n",
    "_ = np.seterr(divide='ignore', invalid='ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_types_dict = {\n",
    "    'timestamp': 'int64',\n",
    "    'user_id': 'int32', \n",
    "    'content_id': 'int16', \n",
    "    'content_type_id':'int8', \n",
    "    'task_container_id': 'int16',\n",
    "    #'user_answer': 'int8',\n",
    "    'answered_correctly': 'int8', \n",
    "    'prior_question_elapsed_time': 'float32', \n",
    "    'prior_question_had_explanation': 'bool'\n",
    "}\n",
    "target = 'answered_correctly'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('start read train data...')\n",
    "train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns=set(data_types_dict.keys())).to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_df=train_df.sample(frac=0.1).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('start handle lecture data...')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#reading in lecture df\n",
    "lectures_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lectures_df['type_of'] = lectures_df['type_of'].replace('solving question', 'solving_question')\n",
    "\n",
    "lectures_df = pd.get_dummies(lectures_df, columns=['part', 'type_of'])\n",
    "\n",
    "part_lectures_columns = [column for column in lectures_df.columns if column.startswith('part')]\n",
    "\n",
    "types_of_lectures_columns = [column for column in lectures_df.columns if column.startswith('type_of_')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_lectures = train_df[train_df.content_type_id == True].merge(lectures_df, left_on='content_id', right_on='lecture_id', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_lecture_stats_part = train_lectures.groupby('user_id',as_index = False)[part_lectures_columns + types_of_lectures_columns].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lecturedata_types_dict = {   \n",
    "    'user_id': 'int32', \n",
    "    'part_1': 'int8',\n",
    "    'part_2': 'int8',\n",
    "    'part_3': 'int8',\n",
    "    'part_4': 'int8',\n",
    "    'part_5': 'int8',\n",
    "    'part_6': 'int8',\n",
    "    'part_7': 'int8',\n",
    "    'type_of_concept': 'int8',\n",
    "    'type_of_intention': 'int8',\n",
    "    'type_of_solving_question': 'int8',\n",
    "    'type_of_starter': 'int8'\n",
    "}\n",
    "user_lecture_stats_part = user_lecture_stats_part.astype(lecturedata_types_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for column in user_lecture_stats_part.columns:\n",
    "    #bool_column = column + '_boolean'\n",
    "    if(column !='user_id'):\n",
    "        user_lecture_stats_part[column] = (user_lecture_stats_part[column] > 0).astype('int8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_lecture_stats_part.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#clearing memory\n",
    "del(train_lectures)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_lecture_agg = train_df.groupby('user_id')['content_type_id'].agg(['sum', 'count'])\n",
    "user_lecture_agg=user_lecture_agg.astype('int16')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#1= if the event was the user watching a lecture.\n",
    "cum = train_df.groupby('user_id')['content_type_id'].agg(['cumsum', 'cumcount'])\n",
    "cum['cumcount']=cum['cumcount']+1\n",
    "train_df['user_interaction_count'] = cum['cumcount'] \n",
    "train_df['user_interaction_timestamp_mean'] = train_df['timestamp']/cum['cumcount'] \n",
    "train_df['user_lecture_sum'] = cum['cumsum'] \n",
    "train_df['user_lecture_lv'] = cum['cumsum'] / cum['cumcount']\n",
    "\n",
    "\n",
    "train_df.user_lecture_lv=train_df.user_lecture_lv.astype('float16')\n",
    "train_df.user_lecture_sum=train_df.user_lecture_sum.astype('int16')\n",
    "train_df.user_interaction_count=train_df.user_interaction_count.astype('int16')\n",
    "train_df['user_interaction_timestamp_mean']=train_df['user_interaction_timestamp_mean']/(1000*3600)\n",
    "train_df.user_interaction_timestamp_mean=train_df.user_interaction_timestamp_mean.astype('float32')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#pd.options.display.max_rows = 200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "del cum\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('start handle train_df...')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df['prior_question_had_explanation'].fillna(False, inplace=True)\n",
    "train_df = train_df.astype(data_types_dict)\n",
    "train_df = train_df[train_df[target] != -1].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "content_explation_agg=train_df[[\"content_id\",\"prior_question_had_explanation\",target]].groupby([\"content_id\",\"prior_question_had_explanation\"])[target].agg(['mean'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "content_explation_agg.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "content_explation_agg=content_explation_agg.unstack()\n",
    "\n",
    "content_explation_agg=content_explation_agg.reset_index()\n",
    "content_explation_agg.columns = ['content_id', 'content_explation_false_mean','content_explation_true_mean']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "content_explation_agg.content_id=content_explation_agg.content_id.astype('int16')\n",
    "content_explation_agg.content_explation_false_mean=content_explation_agg.content_explation_false_mean.astype('float16')\n",
    "content_explation_agg.content_explation_true_mean=content_explation_agg.content_explation_true_mean.astype('float16')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('start handle attempt_no...')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "train_df[\"attempt_no\"] = 1\n",
    "train_df.attempt_no=train_df.attempt_no.astype('int8')\n",
    "#\n",
    "attempt_no_agg=train_df.groupby([\"user_id\",\"content_id\"])[\"attempt_no\"].agg(['sum']).astype('int8')\n",
    "#attempt_no_agg=attempt_no_agg.astype('int8')\n",
    "train_df[\"attempt_no\"] = train_df[[\"user_id\",\"content_id\",'attempt_no']].groupby([\"user_id\",\"content_id\"])[\"attempt_no\"].cumsum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#attempt_no_agg=attempt_no_agg.reset_index()\n",
    "attempt_no_agg=attempt_no_agg[attempt_no_agg['sum'] >1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('start handle timestamp...')\n",
    "prior_question_elapsed_time_mean=train_df['prior_question_elapsed_time'].mean()\n",
    "train_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_timestamp_u = train_df[['user_id','timestamp']].groupby(['user_id']).agg(['max']).reset_index()\n",
    "max_timestamp_u.columns = ['user_id', 'max_time_stamp']\n",
    "max_timestamp_u.user_id=max_timestamp_u.user_id.astype('int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "train_df['lagtime'] = train_df.groupby('user_id')['timestamp'].shift()\n",
    "\n",
    "max_timestamp_u2 = train_df[['user_id','lagtime']].groupby(['user_id']).agg(['max']).reset_index()\n",
    "max_timestamp_u2.columns = ['user_id', 'max_time_stamp2']\n",
    "max_timestamp_u2.user_id=max_timestamp_u2.user_id.astype('int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df['lagtime']=train_df['timestamp']-train_df['lagtime']\n",
    "lagtime_mean=train_df['lagtime'].mean()\n",
    "train_df['lagtime'].fillna(lagtime_mean, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df['lagtime']=train_df['lagtime']/(1000*3600)\n",
    "train_df.lagtime=train_df.lagtime.astype('float32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# lagtime_agg = train_df.groupby('user_id')['lagtime'].agg(['mean'])\n",
    "# train_df['lagtime_mean'] = train_df['user_id'].map(lagtime_agg['mean'])\n",
    "# train_df.lagtime_mean=train_df.lagtime_mean.astype('int32')\n",
    "# lagtime_agg=lagtime_agg.astype('int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "train_df['lagtime2'] = train_df.groupby('user_id')['timestamp'].shift(2)\n",
    "\n",
    "max_timestamp_u3 = train_df[['user_id','lagtime2']].groupby(['user_id']).agg(['max']).reset_index()\n",
    "max_timestamp_u3.columns = ['user_id', 'max_time_stamp3']\n",
    "max_timestamp_u3.user_id=max_timestamp_u3.user_id.astype('int32')\n",
    "\n",
    "train_df['lagtime2']=train_df['timestamp']-train_df['lagtime2']\n",
    "lagtime_mean2=train_df['lagtime2'].mean()\n",
    "train_df['lagtime2'].fillna(lagtime_mean2, inplace=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df['lagtime2']=train_df['lagtime2']/(1000*3600)\n",
    "train_df.lagtime2=train_df.lagtime2.astype('float32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "train_df['lagtime3'] = train_df.groupby('user_id')['timestamp'].shift(3)\n",
    "\n",
    "train_df['lagtime3']=train_df['timestamp']-train_df['lagtime3']\n",
    "lagtime_mean3=train_df['lagtime3'].mean()\n",
    "train_df['lagtime3'].fillna(lagtime_mean3, inplace=True)\n",
    "train_df['lagtime3']=train_df['lagtime3']/(1000*3600)\n",
    "train_df.lagtime3=train_df.lagtime3.astype('float32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# lagtime_agg2 = train_df.groupby('user_id')['lagtime2'].agg(['mean'])\n",
    "# train_df['lagtime_mean2'] = train_df['user_id'].map(lagtime_agg2['mean'])\n",
    "# train_df.lagtime_mean2=train_df.lagtime_mean2.astype('int32')\n",
    "# lagtime_agg2=lagtime_agg2.astype('int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df['timestamp']=train_df['timestamp']/(1000*3600)\n",
    "#\n",
    "train_df.timestamp=train_df.timestamp.astype('float16')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_prior_question_elapsed_time = train_df[['user_id','prior_question_elapsed_time']].groupby(['user_id']).tail(1)\n",
    "user_prior_question_elapsed_time.columns = ['user_id', 'prior_question_elapsed_time']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "train_df['delta_prior_question_elapsed_time'] = train_df.groupby('user_id')['prior_question_elapsed_time'].shift()\n",
    "train_df['delta_prior_question_elapsed_time']=train_df['prior_question_elapsed_time']-train_df['delta_prior_question_elapsed_time']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "delta_prior_question_elapsed_time_mean=train_df['delta_prior_question_elapsed_time'].mean()\n",
    "train_df['delta_prior_question_elapsed_time'].fillna(delta_prior_question_elapsed_time_mean, inplace=True)\n",
    "train_df.delta_prior_question_elapsed_time=train_df.delta_prior_question_elapsed_time.astype('int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "train_df['lag'] = train_df.groupby('user_id')[target].shift()\n",
    "\n",
    "cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])\n",
    "##cum['cumcount']=cum['cumcount']+1\n",
    "user_agg = train_df.groupby('user_id')['lag'].agg(['sum', 'count']).astype('int16')\n",
    "cum['cumsum'].fillna(0, inplace=True)\n",
    "\n",
    "train_df['user_correctness'] = cum['cumsum'] / cum['cumcount']\n",
    "train_df['user_correct_count'] = cum['cumsum']\n",
    "train_df['user_uncorrect_count'] = cum['cumcount']-cum['cumsum']\n",
    "#train_df['user_answer_count'] = cum['cumcount']\n",
    "train_df.drop(columns=['lag'], inplace=True)\n",
    "train_df['user_correctness'].fillna(0.67, inplace=True)\n",
    "train_df.user_correctness=train_df.user_correctness.astype('float16')\n",
    "train_df.user_correct_count=train_df.user_correct_count.astype('int16')\n",
    "train_df.user_uncorrect_count=train_df.user_uncorrect_count.astype('int16')\n",
    "#train_df.user_answer_count=train_df.user_answer_count.astype('int16')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "del cum\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_df['lag'] = train_df.groupby('user_id')[target].shift(2)\n",
    "# cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])\n",
    "# ##cum['cumcount']=cum['cumcount']+1\n",
    "# user_agg2 = train_df.groupby('user_id')['lag'].agg(['sum', 'count']).astype('int16')\n",
    "# cum['cumsum'].fillna(0, inplace=True)\n",
    "\n",
    "# train_df['user_correctness2'] = cum['cumsum'] / cum['cumcount']\n",
    "# train_df['user_correct_count2'] = cum['cumsum']\n",
    "# train_df['user_uncorrect_count2'] = cum['cumcount']-cum['cumsum']\n",
    "# #train_df['user_answer_count2'] = cum['cumcount']\n",
    "# train_df.drop(columns=['lag'], inplace=True)\n",
    "# train_df['user_correctness2'].fillna(0.67, inplace=True)\n",
    "# train_df.user_correctness2=train_df.user_correctness2.astype('float16')\n",
    "# train_df.user_correct_count2=train_df.user_correct_count2.astype('int16')\n",
    "# train_df.user_uncorrect_count2=train_df.user_uncorrect_count2.astype('int16')\n",
    "# #train_df.user_answer_count2=train_df.user_answer_count2.astype('int16')\n",
    "# del cum\n",
    "# gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.prior_question_had_explanation=train_df.prior_question_had_explanation.astype('int8')\n",
    "explanation_agg = train_df.groupby('user_id')['prior_question_had_explanation'].agg(['sum', 'count'])\n",
    "explanation_agg=explanation_agg.astype('int16')\n",
    "# explanation_agg.sum=explanation_agg.sum.astype('int16')\n",
    "# explanation_agg.count=explanation_agg.count.astype('int16')\n",
    "#explanation_agg.var=explanation_agg.var.astype('float16')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "#train_df['lag'] = train_df.groupby('user_id')['prior_question_had_explanation'].shift()\n",
    "\n",
    "cum = train_df.groupby('user_id')['prior_question_had_explanation'].agg(['cumsum', 'cumcount'])\n",
    "cum['cumcount']=cum['cumcount']+1\n",
    "train_df['explanation_mean'] = cum['cumsum'] / cum['cumcount']\n",
    "train_df['explanation_true_count'] = cum['cumsum'] \n",
    "train_df['explanation_false_count'] =  cum['cumcount']-cum['cumsum']\n",
    "#train_df.drop(columns=['lag'], inplace=True)\n",
    "\n",
    "train_df.explanation_mean=train_df.explanation_mean.astype('float16')\n",
    "train_df.explanation_true_count=train_df.explanation_true_count.astype('int16')\n",
    "train_df.explanation_false_count=train_df.explanation_false_count.astype('int16')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "del cum\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "content_agg = train_df.groupby('content_id')[target].agg(['sum', 'count','var'])\n",
    "task_container_agg = train_df.groupby('task_container_id')[target].agg(['sum', 'count','var'])\n",
    "content_agg=content_agg.astype('float32')\n",
    "task_container_agg=task_container_agg.astype('float32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#\n",
    "train_df['task_container_uncor_count'] = train_df['task_container_id'].map(task_container_agg['count']-task_container_agg['sum']).astype('int32')\n",
    "train_df['task_container_cor_count'] = train_df['task_container_id'].map(task_container_agg['sum']).astype('int32')\n",
    "train_df['task_container_std'] = train_df['task_container_id'].map(task_container_agg['var']).astype('float16')\n",
    "train_df['task_container_correctness'] = train_df['task_container_id'].map(task_container_agg['sum'] / task_container_agg['count'])\n",
    "train_df.task_container_correctness=train_df.task_container_correctness.astype('float16')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "content_elapsed_time_agg=train_df.groupby('content_id')['prior_question_elapsed_time'].agg(['mean'])\n",
    "content_had_explanation_agg=train_df.groupby('content_id')['prior_question_had_explanation'].agg(['mean'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('start questions data...')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "questions_df = pd.read_csv(\n",
    "    '../input/riiid-test-answer-prediction/questions.csv', \n",
    "    usecols=[0, 1,3,4],\n",
    "    dtype={'question_id': 'int16','bundle_id': 'int16', 'part': 'int8','tags': 'str'}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bundle_agg = questions_df.groupby('bundle_id')['question_id'].agg(['count'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "questions_df['content_sub_bundle'] = questions_df['bundle_id'].map(bundle_agg['count']).astype('int8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "questions_df['tags'].fillna('188', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def gettags(tags,num):\n",
    "    tags_splits=tags.split(\" \")\n",
    "    result='' \n",
    "    for t in tags_splits:\n",
    "        x=int(t)\n",
    "        if(x<32*(num+1) and x>=32*num):#num \n",
    "            result=result+' '+t\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "for num in range(0,6):\n",
    "    questions_df[\"tags\"+str(num)] = questions_df[\"tags\"].apply(lambda row: gettags(row,num))\n",
    "    le = LabelEncoder()\n",
    "    le.fit(np.unique(questions_df['tags'+str(num)].values))\n",
    "    #questions_df[['tags'+str(num)]=\n",
    "    questions_df['tags'+str(num)]=questions_df[['tags'+str(num)]].apply(le.transform)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "questions_df_dict = {   \n",
    "    'tags0': 'int8',\n",
    "    'tags1': 'int8',\n",
    "    'tags2': 'int8',\n",
    "    'tags3': 'int8',\n",
    "    'tags4': 'int8',\n",
    "    'tags5': 'int8',\n",
    "    #'tags6': 'int8',\n",
    "    #'tags7': 'int8'\n",
    "}\n",
    "questions_df = questions_df.astype(questions_df_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "questions_df.drop(columns=['tags'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "questions_df['part_bundle_id']=questions_df['part']*100000+questions_df['bundle_id']\n",
    "questions_df.part_bundle_id=questions_df.part_bundle_id.astype('int32')\n",
    "# tag = questions_df[\"tags\"].str.split(\" \", n = 10, expand = True)\n",
    "# tag.columns = ['tags1','tags2','tags3','tags4','tags5','tags6']\n",
    "# #\n",
    "\n",
    "# tag.fillna(0, inplace=True)\n",
    "# tag = tag.astype('int16')\n",
    "# questions_df =  pd.concat([questions_df,tag],axis=1).drop(['tags'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# questions_cmnts = pd.read_csv(\n",
    "#     '../input/2020-r3id-clustering-question-tags/question_cmnts.csv', \n",
    "#     usecols=[1,2],\n",
    "#     dtype={'question_id': 'int16','community': 'int8'}\n",
    "# )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# questions_df = pd.merge(questions_df, questions_cmnts, on='question_id', how='left',right_index=True)#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "questions_df.rename(columns={'question_id':'content_id'}, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "questions_df = pd.merge(questions_df, content_explation_agg, on='content_id', how='left',right_index=True)#\n",
    "# questions_df.content_explation_false_mean=questions_df.content_explation_false_mean.astype('float16')\n",
    "# questions_df.content_explation_true_mean=questions_df.content_explation_true_mean.astype('float16')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "del content_explation_agg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "questions_df['content_correctness'] = questions_df['content_id'].map(content_agg['sum'] / content_agg['count'])\n",
    "questions_df.content_correctness=questions_df.content_correctness.astype('float16')\n",
    "questions_df['content_correctness_std'] = questions_df['content_id'].map(content_agg['var'])\n",
    "questions_df.content_correctness_std=questions_df.content_correctness_std.astype('float16')\n",
    "questions_df['content_uncorrect_count'] = questions_df['content_id'].map(content_agg['count']-content_agg['sum']).astype('int32')\n",
    "questions_df['content_correct_count'] = questions_df['content_id'].map(content_agg['sum']).astype('int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "questions_df['content_elapsed_time_mean'] = questions_df['content_id'].map(content_elapsed_time_agg['mean'])\n",
    "questions_df.content_elapsed_time_mean=questions_df.content_elapsed_time_mean.astype('float16')\n",
    "questions_df['content_had_explanation_mean'] = questions_df['content_id'].map(content_had_explanation_agg['mean'])\n",
    "questions_df.content_had_explanation_mean=questions_df.content_had_explanation_mean.astype('float16')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "del content_elapsed_time_agg\n",
    "del content_had_explanation_agg\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "part_agg = questions_df.groupby('part')['content_correctness'].agg(['mean', 'var'])\n",
    "questions_df['part_correctness_mean'] = questions_df['part'].map(part_agg['mean'])\n",
    "questions_df['part_correctness_std'] = questions_df['part'].map(part_agg['var'])\n",
    "questions_df.part_correctness_mean=questions_df.part_correctness_mean.astype('float16')\n",
    "questions_df.part_correctness_std=questions_df.part_correctness_std.astype('float16')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "part_agg = questions_df.groupby('part')['content_uncorrect_count'].agg(['sum'])\n",
    "questions_df['part_uncor_count'] = questions_df['part'].map(part_agg['sum']).astype('int32')\n",
    "#\n",
    "part_agg = questions_df.groupby('part')['content_correct_count'].agg(['sum'])\n",
    "questions_df['part_cor_count'] = questions_df['part'].map(part_agg['sum']).astype('int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bundle_agg = questions_df.groupby('bundle_id')['content_correctness'].agg(['mean'])\n",
    "questions_df['bundle_correctness_mean'] = questions_df['bundle_id'].map(bundle_agg['mean'])\n",
    "questions_df.bundle_correctness_mean=questions_df.bundle_correctness_mean.astype('float16')\n",
    "\n",
    "# bundle_agg = questions_df.groupby('bundle_id')['content_uncorrect_count'].agg(['sum'])\n",
    "# questions_df['bundle_uncor_count'] = questions_df['bundle_id'].map(bundle_agg['sum']).astype('int32')\n",
    "# #\n",
    "# bundle_agg = questions_df.groupby('bundle_id')['content_correct_count'].agg(['sum'])\n",
    "# questions_df['bundle_cor_count'] = questions_df['bundle_id'].map(bundle_agg['sum']).astype('int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#questions_df.loc[questions_df['content_sub_bundle'] !=0, 'content_sub_bundle']=1 #修改列\"content_sub_bundle\"的值，推荐使用.loc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tags1_agg = questions_df.groupby('tags0')['content_correctness'].agg(['mean', 'var'])\n",
    "# questions_df['tags0_correctness_mean'] = questions_df['tags0'].map(tags1_agg['mean'])\n",
    "# questions_df.tags0_correctness_mean=questions_df.tags0_correctness_mean.astype('float16')\n",
    "\n",
    "# tags1_agg = questions_df.groupby('tags1')['content_correctness'].agg(['mean', 'var'])\n",
    "# questions_df['tags1_correctness_mean'] = questions_df['tags1'].map(tags1_agg['mean'])\n",
    "# questions_df.tags1_correctness_mean=questions_df.tags1_correctness_mean.astype('float16')\n",
    "\n",
    "# tags1_agg = questions_df.groupby('tags2')['content_correctness'].agg(['mean', 'var'])\n",
    "# questions_df['tags2_correctness_mean'] = questions_df['tags2'].map(tags1_agg['mean'])\n",
    "# questions_df.tags2_correctness_mean=questions_df.tags2_correctness_mean.astype('float16')\n",
    "\n",
    "# tags1_agg = questions_df.groupby('tags4')['content_correctness'].agg(['mean', 'var'])\n",
    "# questions_df['tags4_correctness_mean'] = questions_df['tags4'].map(tags1_agg['mean'])\n",
    "# questions_df.tags4_correctness_mean=questions_df.tags4_correctness_mean.astype('float16')\n",
    "\n",
    "# questions_df['tags1_correctness_std'] = questions_df['tags1'].map(tags1_agg['var'])\n",
    "\n",
    "# questions_df.tags1_correctness_std=questions_df.tags1_correctness_std.astype('float16')\n",
    "# tags1_agg = questions_df.groupby('tags1')['content_uncorrect_count'].agg(['sum'])\n",
    "# questions_df['tags1_uncor_count'] = questions_df['tags1'].map(tags1_agg['sum']).astype('int32')\n",
    "# #\n",
    "# tags1_agg = questions_df.groupby('tags1')['content_correct_count'].agg(['sum'])\n",
    "# questions_df['tags1_cor_count'] = questions_df['tags1'].map(tags1_agg['sum']).astype('int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "questions_df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "del content_agg\n",
    "del bundle_agg\n",
    "del part_agg\n",
    "#del tags1_agg\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#pd.set_option(\"display.max_columns\",500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#questions_df.drop(columns=['tags4','tags5','tags6'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(train_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#train_df.drop(columns=['content_type_id'], inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features_dict = {\n",
    "    #'user_id',\n",
    "    'timestamp':'float16',#\n",
    "    'user_interaction_count':'int16',\n",
    "    'user_interaction_timestamp_mean':'float32',\n",
    "    'lagtime':'float32',#\n",
    "    'lagtime2':'float32',\n",
    "    'lagtime3':'float32',\n",
    "    #'lagtime_mean':'int32',\n",
    "    'content_id':'int16',\n",
    "    'task_container_id':'int16',\n",
    "    'user_lecture_sum':'int16',#\n",
    "    'user_lecture_lv':'float16',##\n",
    "    'prior_question_elapsed_time':'float32',#\n",
    "    'delta_prior_question_elapsed_time':'int32',#\n",
    "    'user_correctness':'float16',#\n",
    "    'user_uncorrect_count':'int16',#\n",
    "    'user_correct_count':'int16',#\n",
    "    #'content_correctness':'float16',\n",
    "    'content_correctness_std':'float16',\n",
    "    'content_correct_count':'int32',\n",
    "    'content_uncorrect_count':'int32',#\n",
    "    'content_elapsed_time_mean':'float16',\n",
    "    'content_had_explanation_mean':'float16',\n",
    "    'content_explation_false_mean':'float16',\n",
    "    'content_explation_true_mean':'float16',\n",
    "    'task_container_correctness':'float16',\n",
    "    'task_container_std':'float16',\n",
    "    'task_container_cor_count':'int32',#\n",
    "    'task_container_uncor_count':'int32',#\n",
    "    'attempt_no':'int8',#\n",
    "    'part':'int8',\n",
    "    'part_correctness_mean':'float16',\n",
    "    'part_correctness_std':'float16',\n",
    "    'part_uncor_count':'int32',\n",
    "    'part_cor_count':'int32',\n",
    "    'tags0': 'int8',\n",
    "    'tags1': 'int8',\n",
    "    'tags2': 'int8',\n",
    "    'tags3': 'int8',\n",
    "    'tags4': 'int8',\n",
    "    'tags5': 'int8',\n",
    "   # 'tags6': 'int8',\n",
    "   # 'tags7': 'int8',\n",
    "#     'tags0_correctness_mean':'float16',\n",
    "#     'tags1_correctness_mean':'float16',\n",
    "#     'tags2_correctness_mean':'float16',\n",
    "#     'tags4_correctness_mean':'float16',\n",
    "#     'bundle_id':'int16',\n",
    "#     'bundle_correctness_mean':'float16',\n",
    "#     'bundle_uncor_count':'int32',\n",
    "#     'bundle_cor_count':'int32',\n",
    "    'part_bundle_id':'int32',\n",
    "    'content_sub_bundle':'int8',\n",
    "    'prior_question_had_explanation':'int8',\n",
    "    'explanation_mean':'float16', #\n",
    "    #'explanation_var',#\n",
    "    'explanation_false_count':'int16',#\n",
    "    'explanation_true_count':'int16',#\n",
    "   # 'community':'int8',\n",
    "#     'part_1',\n",
    "#     'part_2',\n",
    "#     'part_3',\n",
    "#     'part_4',\n",
    "#     'part_5',\n",
    "#     'part_6',\n",
    "#     'part_7',\n",
    "#     'type_of_concept',\n",
    "#     'type_of_intention',\n",
    "#     'type_of_solving_question',\n",
    "#     'type_of_starter'\n",
    "}\n",
    "categorical_columns= [\n",
    "    #'user_id',\n",
    "    'content_id',\n",
    "    'task_container_id',\n",
    "    'part',\n",
    "   # 'community',\n",
    "    'tags0',\n",
    "    'tags1',\n",
    "    'tags2',\n",
    "    'tags3',\n",
    "    'tags4',\n",
    "    'tags5',\n",
    "    #'tags6',\n",
    "    #'tags7',\n",
    "    #'bundle_id',\n",
    "    'part_bundle_id',\n",
    "    'content_sub_bundle',\n",
    "    'prior_question_had_explanation', \n",
    "#     'part_1',\n",
    "#     'part_2',\n",
    "#     'part_3',\n",
    "#     'part_4',\n",
    "#     'part_5',\n",
    "#     'part_6',\n",
    "#     'part_7',\n",
    "#     'type_of_concept',\n",
    "#     'type_of_intention',\n",
    "#     'type_of_solving_question',\n",
    "#     'type_of_starter'\n",
    "]\n",
    "\n",
    "features=list(features_dict.keys())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "flag_lgbm=True\n",
    "clfs = list()\n",
    "params = {\n",
    "'num_leaves': 200,\n",
    "'max_bin':450,\n",
    "# 'min_child_weight': 0.03454472573214212,\n",
    "'feature_fraction': 0.52,\n",
    "'bagging_fraction': 0.52,\n",
    "#'min_data_in_leaf': 106,\n",
    "# 'max_depth': -1,\n",
    "'objective': 'binary',\n",
    "'learning_rate': 0.05,\n",
    "\"boosting_type\": \"gbdt\",\n",
    "\"metric\": 'auc',\n",
    "# \"bagging_seed\": 11,\n",
    "# \"verbosity\": -1,\n",
    "# 'reg_alpha': 0.3899927210061127,\n",
    "# 'reg_lambda': 0.6485237330340494,\n",
    "# 'random_state': 47\n",
    "}\n",
    "trains=list()\n",
    "valids=list()\n",
    "num=1\n",
    "for i in range(0,num):\n",
    "    \n",
    "    #train_df=train_df.reset_index(drop=True)\n",
    "    #train_df_clf=train_df.sample(n=1200*10000)\n",
    "    train_df_clf=train_df[1200*10000:2*1200*10000]\n",
    "    print('sample end')\n",
    "    #train_df.drop(train_df_clf.index, inplace=True)\n",
    "    #print('train_df drop end')\n",
    "    \n",
    "   \n",
    "    del train_df\n",
    "    \n",
    "    \n",
    "    \n",
    "    users=train_df_clf['user_id'].drop_duplicates()#\n",
    "    #\n",
    "    users=users.sample(frac=0.08)\n",
    "    users_df=pd.DataFrame()\n",
    "    users_df['user_id']=users.values\n",
    "   \n",
    "   \n",
    "    valid_df_newuser = pd.merge(train_df_clf, users_df, on=['user_id'], how='inner',right_index=True)\n",
    "    del users_df\n",
    "    del users\n",
    "    gc.collect()\n",
    "    #\n",
    "    train_df_clf.drop(valid_df_newuser.index, inplace=True)\n",
    "    print('pd.merge(train_df_clf, questions_df)')\n",
    "    #-----------\n",
    "    #train_df_clf=train_df_clf.sample(frac=0.2)\n",
    "    #train_df_clf.drop(valid_df_newuser.index, inplace=True)\n",
    "    train_df_clf = pd.merge(train_df_clf, questions_df, on='content_id', how='left',right_index=True)#\n",
    "    valid_df_newuser = pd.merge(valid_df_newuser, questions_df, on='content_id', how='left',right_index=True)#\n",
    "    \n",
    "#     train_df_clf = pd.merge(train_df_clf, user_lecture_stats_part, on='user_id', how=\"left\",right_index=True)\n",
    "#     valid_df_newuser = pd.merge(valid_df_newuser, user_lecture_stats_part, on='user_id', how=\"left\",right_index=True)\n",
    "    print('valid_df')\n",
    "    valid_df=train_df_clf.sample(frac=0.1)\n",
    "    train_df_clf.drop(valid_df.index, inplace=True)\n",
    "    \n",
    "#     test_df=train_df_clf.sample(n=100*10000)\n",
    "#     train_df_clf.drop(test_df.index, inplace=True)\n",
    "   \n",
    "    valid_df = valid_df.append(valid_df_newuser)\n",
    "    del valid_df_newuser\n",
    "    gc.collect()\n",
    "    #\n",
    "\n",
    "    trains.append(train_df_clf)\n",
    "    valids.append(valid_df)\n",
    "    print('train_df_clf length：',len(train_df_clf))\n",
    "    print('valid_df length：',len(valid_df))\n",
    "    #train_df=train_df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#del train_df\n",
    "del train_df_clf\n",
    "del valid_df\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "for i in range(0,num):\n",
    "      \n",
    "#     tr_data = lgb.Dataset(trains[i][features], label=trains[i][target])\n",
    "#     va_data = lgb.Dataset(valids[i][features], label=valids[i][target])\n",
    "\n",
    "    #Don't use DF to create lightgbm dataset, rather use np array:\n",
    "    X_train_np = trains[i][features].values.astype(np.float32)\n",
    "    X_valid_np = valids[i][features].values.astype(np.float32)\n",
    "    #features = train.columns\n",
    "    tr_data = lgb.Dataset(X_train_np, label=trains[i][target], feature_name=list(features))\n",
    "    va_data = lgb.Dataset(X_valid_np, label=valids[i][target], feature_name=list(features))\n",
    "    \n",
    "\n",
    "#     del train_df_clf\n",
    "#     del valid_df\n",
    "#     gc.collect()\n",
    "    del trains\n",
    "    del valids\n",
    "    del X_train_np\n",
    "    del X_valid_np\n",
    "    gc.collect()\n",
    "\n",
    "    model = lgb.train(\n",
    "        params, \n",
    "        tr_data,\n",
    "#         train_df[features],\n",
    "#         train_df[target],\n",
    "        num_boost_round=5000,\n",
    "        #valid_sets=[(train_df[features],train_df[target]), (valid_df[features],valid_df[target])], \n",
    "        valid_sets=[tr_data, va_data],\n",
    "        early_stopping_rounds=50,\n",
    "        feature_name=features,\n",
    "        categorical_feature=categorical_columns,\n",
    "        verbose_eval=50\n",
    "    )\n",
    "    clfs.append(model)\n",
    "    #print('test-auc:', roc_auc_score(test_df[target], model.predict(test_df[features])))\n",
    "    #model.save_model(f'model.txt')\n",
    "\n",
    "\n",
    "    fig,ax = plt.subplots(figsize=(15,15))\n",
    "    lgb.plot_importance(model, ax=ax,importance_type='gain',max_num_features=50)\n",
    "    plt.show()\n",
    "\n",
    "    del tr_data\n",
    "    del va_data\n",
    "    gc.collect()\n",
    "#    \n",
    "# del trains\n",
    "# del valids\n",
    "# gc.collect()\n",
    "# del test_df\n",
    "# gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "\n",
    "\n",
    "MAX_SEQ = 100\n",
    "\n",
    "class FFN(nn.Module):\n",
    "    def __init__(self, state_size=200):\n",
    "        super(FFN, self).__init__()\n",
    "        self.state_size = state_size\n",
    "\n",
    "        self.lr1 = nn.Linear(state_size, state_size)\n",
    "        self.relu = nn.ReLU()\n",
    "        self.lr2 = nn.Linear(state_size, state_size)\n",
    "        self.dropout = nn.Dropout(0.2)\n",
    "    \n",
    "    def forward(self, x):\n",
    "        x = self.lr1(x)\n",
    "        x = self.relu(x)\n",
    "        x = self.lr2(x)\n",
    "        return self.dropout(x)\n",
    "\n",
    "def future_mask(seq_length):\n",
    "    future_mask = np.triu(np.ones((seq_length, seq_length)), k=1).astype('bool')\n",
    "    return torch.from_numpy(future_mask)\n",
    "\n",
    "\n",
    "class SAKTModel(nn.Module):\n",
    "    def __init__(self, n_skill, max_seq=MAX_SEQ, embed_dim=128):\n",
    "        super(SAKTModel, self).__init__()\n",
    "        self.n_skill = n_skill\n",
    "        self.embed_dim = embed_dim\n",
    "\n",
    "        self.embedding = nn.Embedding(2*n_skill+1, embed_dim)\n",
    "        self.pos_embedding = nn.Embedding(max_seq-1, embed_dim)\n",
    "        self.e_embedding = nn.Embedding(n_skill+1, embed_dim)\n",
    "\n",
    "        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=8, dropout=0.2)\n",
    "\n",
    "        self.dropout = nn.Dropout(0.2)\n",
    "        self.layer_normal = nn.LayerNorm(embed_dim) \n",
    "\n",
    "        self.ffn = FFN(embed_dim)\n",
    "        self.pred = nn.Linear(embed_dim, 1)\n",
    "    \n",
    "    def forward(self, x, question_ids):\n",
    "        device = x.device        \n",
    "        x = self.embedding(x)\n",
    "        pos_id = torch.arange(x.size(1)).unsqueeze(0).to(device)\n",
    "\n",
    "        pos_x = self.pos_embedding(pos_id)\n",
    "        x = x + pos_x\n",
    "\n",
    "        e = self.e_embedding(question_ids)\n",
    "\n",
    "        x = x.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]\n",
    "        e = e.permute(1, 0, 2)\n",
    "        att_mask = future_mask(x.size(0)).to(device)\n",
    "        att_output, att_weight = self.multi_att(e, x, x, attn_mask=att_mask)\n",
    "        att_output = self.layer_normal(att_output + e)\n",
    "        att_output = att_output.permute(1, 0, 2) # att_output: [s_len, bs, embed] => [bs, s_len, embed]\n",
    "\n",
    "        x = self.ffn(att_output)\n",
    "        x = self.layer_normal(x + att_output)\n",
    "        x = self.pred(x)\n",
    "\n",
    "        return x.squeeze(-1), att_weight\n",
    "    \n",
    "    \n",
    "skills = joblib.load(\"/kaggle/input/riiid-sakt-model-dataset-public/skills.pkl.zip\")\n",
    "n_skill = len(skills)\n",
    "group = joblib.load(\"/kaggle/input/riiid-sakt-model-dataset-public/group.pkl.zip\")\n",
    "\n",
    "\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "nn_model = SAKTModel(n_skill, embed_dim=128)\n",
    "try:\n",
    "    nn_model.load_state_dict(torch.load(\"/kaggle/input/riiid-sakt-model-dataset-public/sakt_model.pt\"))\n",
    "except:\n",
    "    nn_model.load_state_dict(torch.load(\"/kaggle/input/riiid-sakt-model-dataset-public/sakt_model.pt\", map_location='cpu'))\n",
    "nn_model.to(device)\n",
    "nn_model.eval()\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class TestDataset(Dataset):\n",
    "    def __init__(self, samples, test_df, skills, max_seq=MAX_SEQ): \n",
    "        super(TestDataset, self).__init__()\n",
    "        self.samples = samples\n",
    "        self.user_ids = [x for x in test_df[\"user_id\"].unique()]\n",
    "        self.test_df = test_df\n",
    "        self.skills = skills\n",
    "        self.n_skill = len(skills)\n",
    "        self.max_seq = max_seq\n",
    "\n",
    "    def __len__(self):\n",
    "        return self.test_df.shape[0]\n",
    "\n",
    "    def __getitem__(self, index):\n",
    "        test_info = self.test_df.iloc[index]\n",
    "\n",
    "        user_id = test_info[\"user_id\"]\n",
    "        target_id = test_info[\"content_id\"]\n",
    "\n",
    "        q = np.zeros(self.max_seq, dtype=int)\n",
    "        qa = np.zeros(self.max_seq, dtype=int)\n",
    "\n",
    "        if user_id in self.samples.index:\n",
    "            q_, qa_ = self.samples[user_id]\n",
    "            \n",
    "            seq_len = len(q_)\n",
    "\n",
    "            if seq_len >= self.max_seq:\n",
    "                q = q_[-self.max_seq:]\n",
    "                qa = qa_[-self.max_seq:]\n",
    "            else:\n",
    "                q[-seq_len:] = q_\n",
    "                qa[-seq_len:] = qa_          \n",
    "        \n",
    "        x = np.zeros(self.max_seq-1, dtype=int)\n",
    "        x = q[1:].copy()\n",
    "        x += (qa[1:] == 1) * self.n_skill\n",
    "        \n",
    "        questions = np.append(q[2:], [target_id])\n",
    "        \n",
    "        return x, questions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_sum_dict = user_agg['sum'].astype('int16').to_dict(defaultdict(int))\n",
    "user_count_dict = user_agg['count'].astype('int16').to_dict(defaultdict(int))\n",
    "# user_sum_dict2 = user_agg2['sum'].astype('int16').to_dict(defaultdict(int))\n",
    "# user_count_dict2 = user_agg2['count'].astype('int16').to_dict(defaultdict(int))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#user_var_dict = user_agg['var'].astype('float16').to_dict(defaultdict(int))\n",
    "\n",
    "# content_sum_dict = content_agg['sum'].astype('int32').to_dict(defaultdict(int))\n",
    "# content_count_dict = content_agg['count'].astype('int32').to_dict(defaultdict(int))\n",
    "\n",
    "del user_agg\n",
    "#del user_agg2\n",
    "#del content_agg\n",
    "gc.collect()\n",
    "\n",
    "task_container_sum_dict = task_container_agg['sum'].astype('int32').to_dict(defaultdict(int))\n",
    "task_container_count_dict = task_container_agg['count'].astype('int32').to_dict(defaultdict(int))\n",
    "task_container_std_dict = task_container_agg['var'].astype('float16').to_dict(defaultdict(int))\n",
    "\n",
    "explanation_sum_dict = explanation_agg['sum'].astype('int16').to_dict(defaultdict(int))\n",
    "explanation_count_dict = explanation_agg['count'].astype('int16').to_dict(defaultdict(int))\n",
    "#explanation_var_dict = explanation_agg['var'].astype('float16').to_dict(defaultdict(int))\n",
    "del task_container_agg\n",
    "del explanation_agg\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#task_container_std_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_lecture_sum_dict = user_lecture_agg['sum'].astype('int16').to_dict(defaultdict(int))\n",
    "user_lecture_count_dict = user_lecture_agg['count'].astype('int16').to_dict(defaultdict(int))\n",
    "\n",
    "#lagtime_mean_dict = lagtime_agg['mean'].astype('int32').to_dict(defaultdict(int))\n",
    "#del prior_question_elapsed_time_agg\n",
    "del user_lecture_agg\n",
    "#del lagtime_agg\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_timestamp_u_dict=max_timestamp_u.set_index('user_id').to_dict()\n",
    "max_timestamp_u_dict2=max_timestamp_u2.set_index('user_id').to_dict()\n",
    "max_timestamp_u_dict3=max_timestamp_u3.set_index('user_id').to_dict()\n",
    "user_prior_question_elapsed_time_dict=user_prior_question_elapsed_time.set_index('user_id').to_dict()\n",
    "#del question_elapsed_time_agg\n",
    "del max_timestamp_u\n",
    "del max_timestamp_u2\n",
    "del max_timestamp_u3\n",
    "del user_prior_question_elapsed_time\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "attempt_no_sum_dict = attempt_no_agg['sum'].to_dict(defaultdict(int))\n",
    "\n",
    "del attempt_no_agg\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_max_attempt(user_id,content_id):\n",
    "    k = (user_id,content_id)\n",
    "\n",
    "    if k in attempt_no_sum_dict.keys():\n",
    "        attempt_no_sum_dict[k]+=1\n",
    "        return attempt_no_sum_dict[k]\n",
    "\n",
    "    attempt_no_sum_dict[k] = 1\n",
    "    return attempt_no_sum_dict[k]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# def get_max_attempt(user_id,content_id):\n",
    "#     global  attempt_no_agg\n",
    "#     #k = (user_id,content_id)\n",
    "#     if(len(attempt_no_agg[(attempt_no_agg['user_id']==user_id) & (attempt_no_agg['content_id'] ==content_id)])==1):\n",
    "#     #if k in attempt_no_sum_dict.keys():\n",
    "#         x= attempt_no_agg.loc[(attempt_no_agg['user_id']==user_id) & (attempt_no_agg['content_id'] ==content_id),'sum'].values \n",
    "#         attempt_no_agg.loc[(attempt_no_agg['user_id']==user_id) & (attempt_no_agg['content_id'] ==content_id),'sum']=x+1\n",
    "#         return x+1\n",
    "    \n",
    "#     attempt_no_agg = attempt_no_agg.append([{'user_id':user_id,'content_id':content_id,'sum':1}], ignore_index=True)\n",
    "#     return 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# model = lgb.Booster(model_file='../input/riiid-lgbm-starter/model.txt')\n",
    "env = riiideducation.make_env()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "iter_test = env.iter_test()\n",
    "prior_test_df = None\n",
    "prev_test_df1 = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "N=[0.4,0.6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "for (test_df, sample_prediction_df) in iter_test:\n",
    "    test_df1=test_df.copy()\n",
    "    if (prev_test_df1 is not None):\n",
    "        prev_test_df1['answered_correctly'] = eval(test_df1['prior_group_answers_correct'].iloc[0])\n",
    "        prev_test_df1 = prev_test_df1[prev_test_df1.content_type_id == False]\n",
    "        \n",
    "        prev_group = prev_test_df1[['user_id', 'content_id', 'answered_correctly']].groupby('user_id').apply(lambda r: (\n",
    "            r['content_id'].values,\n",
    "            r['answered_correctly'].values))\n",
    "        for prev_user_id in prev_group.index:\n",
    "            if prev_user_id in group.index:\n",
    "                group[prev_user_id] = (\n",
    "                    np.append(group[prev_user_id][0], prev_group[prev_user_id][0])[-MAX_SEQ:], \n",
    "                    np.append(group[prev_user_id][1], prev_group[prev_user_id][1])[-MAX_SEQ:]\n",
    "                )\n",
    " \n",
    "            else:\n",
    "                group[prev_user_id] = (\n",
    "                    prev_group[prev_user_id][0], \n",
    "                    prev_group[prev_user_id][1]\n",
    "                )\n",
    "\n",
    "    prev_test_df1 = test_df1.copy()\n",
    "    \n",
    "    test_df1 = test_df1[test_df1.content_type_id == False]\n",
    "    test_dataset = TestDataset(group, test_df1, skills)\n",
    "    test_dataloader = DataLoader(test_dataset, batch_size=51200, shuffle=False)\n",
    "    \n",
    "    outs = []\n",
    "\n",
    "    for item in test_dataloader:\n",
    "        x = item[0].to(device).long()\n",
    "        target_id = item[1].to(device).long()\n",
    "        with torch.no_grad():\n",
    "            output, att_weight = nn_model(x, target_id)\n",
    "        outs.extend(torch.sigmoid(output)[:, -1].view(-1).data.cpu().numpy())\n",
    "        \n",
    "    #test_df1['answered_correctly'] = outs\n",
    "    \n",
    "    \n",
    "    \n",
    "    \n",
    "    ###for lgb\n",
    "    if prior_test_df is not None:\n",
    "        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])\n",
    "        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)\n",
    "        #prior_test_df = prior_test_df[prior_test_df[target] != -1]\n",
    "        prior_test_df['prior_question_had_explanation'].fillna(False, inplace=True)       \n",
    "        prior_test_df.prior_question_had_explanation=prior_test_df.prior_question_had_explanation.astype('int8')\n",
    "    \n",
    "        user_ids = prior_test_df['user_id'].values\n",
    "        #content_ids = prior_test_df['content_id'].values\n",
    "        #task_container_ids = prior_test_df['task_container_id'].values\n",
    "        #prior_question_had_explanations = prior_test_df['prior_question_had_explanation'].values\n",
    "        targets = prior_test_df[target].values        \n",
    "        \n",
    "        for user_id, answered_correctly in zip(user_ids,targets):\n",
    "            user_sum_dict[user_id] += answered_correctly\n",
    "            user_count_dict[user_id] += 1\n",
    "#             user_sum_dict2[user_id] += answered_correctly\n",
    "#             user_count_dict2[user_id] += 1   \n",
    "            \n",
    "\n",
    "    prior_test_df = test_df.copy() \n",
    "        \n",
    "    \n",
    "    question_len=len( test_df[test_df['content_type_id'] == 0])\n",
    "    test_df['prior_question_had_explanation'].fillna(False, inplace=True)\n",
    "    test_df.prior_question_had_explanation=test_df.prior_question_had_explanation.astype('int8')\n",
    "    test_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean, inplace=True)\n",
    "    \n",
    "\n",
    "    user_lecture_sum = np.zeros(question_len, dtype=np.int16)\n",
    "    user_lecture_count = np.zeros(question_len, dtype=np.int16) \n",
    "    \n",
    "    user_sum = np.zeros(question_len, dtype=np.int16)\n",
    "    user_count = np.zeros(question_len, dtype=np.int16)\n",
    "#     user_sum2 = np.zeros(question_len, dtype=np.int16)\n",
    "#     user_count2 = np.zeros(question_len, dtype=np.int16)\n",
    "\n",
    "#     user_sum_dict_test=defaultdict(int)\n",
    "#     user_count_dict_test=defaultdict(int)\n",
    "\n",
    "    task_container_sum = np.zeros(question_len, dtype=np.int32)\n",
    "    task_container_count = np.zeros(question_len, dtype=np.int32)\n",
    "    task_container_std = np.zeros(question_len, dtype=np.float16)\n",
    "\n",
    "    explanation_sum = np.zeros(question_len, dtype=np.int32)\n",
    "    explanation_count = np.zeros(question_len, dtype=np.int32)\n",
    "    delta_prior_question_elapsed_time = np.zeros(question_len, dtype=np.int32)\n",
    "\n",
    "    attempt_no_count = np.zeros(question_len, dtype=np.int16)\n",
    "    lagtime = np.zeros(question_len, dtype=np.float32)\n",
    "    lagtime2 = np.zeros(question_len, dtype=np.float32)\n",
    "    lagtime3 = np.zeros(question_len, dtype=np.float32)\n",
    "    #lagtime_means = np.zeros(question_len, dtype=np.int32)\n",
    "    #\n",
    "   \n",
    "    i=0\n",
    "    for j, (user_id,prior_question_had_explanation,content_type_id,prior_question_elapsed_time,timestamp, content_id,task_container_id) in enumerate(zip(test_df['user_id'].values,test_df['prior_question_had_explanation'].values,test_df['content_type_id'].values,test_df['prior_question_elapsed_time'].values,test_df['timestamp'].values, test_df['content_id'].values, test_df['task_container_id'].values)):\n",
    "        \n",
    "         #\n",
    "        user_lecture_sum_dict[user_id] += content_type_id\n",
    "        user_lecture_count_dict[user_id] += 1\n",
    "        if(content_type_id==1):#\n",
    "            x=1\n",
    "#             if(len(user_lecture_stats_part[user_lecture_stats_part.user_id==user_id])==0):\n",
    "#                 user_lecture_stats_part = user_lecture_stats_part.append([{'user_id':user_id}], ignore_index=True)\n",
    "#                 user_lecture_stats_part.fillna(0, inplace=True)\n",
    "#                 user_lecture_stats_part.loc[user_lecture_stats_part.user_id==user_id,part_lectures_columns + types_of_lectures_columns]+=lectures_df[lectures_df.lecture_id==content_id][part_lectures_columns + types_of_lectures_columns].values\n",
    "#             else:\n",
    "#                 user_lecture_stats_part.loc[user_lecture_stats_part.user_id==user_id,part_lectures_columns + types_of_lectures_columns]+=lectures_df[lectures_df.lecture_id==content_id][part_lectures_columns + types_of_lectures_columns].values\n",
    "        if(content_type_id==0):#   \n",
    "            user_lecture_sum[i] = user_lecture_sum_dict[user_id]\n",
    "            user_lecture_count[i] = user_lecture_count_dict[user_id]\n",
    "                \n",
    "            user_sum[i] = user_sum_dict[user_id]\n",
    "            user_count[i] = user_count_dict[user_id]\n",
    "#             user_sum2[i] = user_sum_dict2[user_id]\n",
    "#             user_count2[i] = user_count_dict2[user_id]\n",
    "    #         content_sum[i] = content_sum_dict[content_id]\n",
    "    #         content_count[i] = content_count_dict[content_id]\n",
    "            task_container_sum[i] = task_container_sum_dict[task_container_id]\n",
    "            task_container_count[i] = task_container_count_dict[task_container_id]\n",
    "            task_container_std[i]=task_container_std_dict[task_container_id]\n",
    "\n",
    "            explanation_sum_dict[user_id] += prior_question_had_explanation\n",
    "            explanation_count_dict[user_id] += 1\n",
    "            explanation_sum[i] = explanation_sum_dict[user_id]\n",
    "            explanation_count[i] = explanation_count_dict[user_id]\n",
    "\n",
    "            if user_id in max_timestamp_u_dict['max_time_stamp'].keys():\n",
    "                lagtime[i]=timestamp-max_timestamp_u_dict['max_time_stamp'][user_id]\n",
    "                if(max_timestamp_u_dict2['max_time_stamp2'][user_id]==lagtime_mean2):#\n",
    "                    lagtime2[i]=lagtime_mean2\n",
    "                    lagtime3[i]=lagtime_mean3\n",
    "                    #max_timestamp_u_dict3['max_time_stamp3'].update({user_id:lagtime_mean3})\n",
    "                else:\n",
    "                    lagtime2[i]=timestamp-max_timestamp_u_dict2['max_time_stamp2'][user_id]\n",
    "                    if(max_timestamp_u_dict3['max_time_stamp3'][user_id]==lagtime_mean3):\n",
    "                        lagtime3[i]=lagtime_mean3 #\n",
    "                    else:\n",
    "                        lagtime3[i]=timestamp-max_timestamp_u_dict3['max_time_stamp3'][user_id]\n",
    "                    \n",
    "                    max_timestamp_u_dict3['max_time_stamp3'][user_id]=max_timestamp_u_dict2['max_time_stamp2'][user_id]\n",
    "                        \n",
    "                max_timestamp_u_dict2['max_time_stamp2'][user_id]=max_timestamp_u_dict['max_time_stamp'][user_id]\n",
    "                max_timestamp_u_dict['max_time_stamp'][user_id]=timestamp\n",
    "#                 lagtime_means[i]=(lagtime_mean_dict[user_id]+lagtime[i])/2\n",
    "#                 lagtime_mean_dict[user_id]=lagtime_means[i]\n",
    "            else:\n",
    "                lagtime[i]=lagtime_mean\n",
    "                max_timestamp_u_dict['max_time_stamp'].update({user_id:timestamp})\n",
    "                lagtime2[i]=lagtime_mean2#\n",
    "                max_timestamp_u_dict2['max_time_stamp2'].update({user_id:lagtime_mean2})\n",
    "                lagtime3[i]=lagtime_mean3#\n",
    "                max_timestamp_u_dict3['max_time_stamp3'].update({user_id:lagtime_mean3})\n",
    "#                 lagtime_mean_dict.update({user_id:lagtime_mean})\n",
    "#                 lagtime_means[i]=(lagtime_mean_dict[user_id]+lagtime[i])/2\n",
    "\n",
    "            if user_id in user_prior_question_elapsed_time_dict['prior_question_elapsed_time'].keys():            \n",
    "                delta_prior_question_elapsed_time[i]=prior_question_elapsed_time-user_prior_question_elapsed_time_dict['prior_question_elapsed_time'][user_id]\n",
    "                user_prior_question_elapsed_time_dict['prior_question_elapsed_time'][user_id]=prior_question_elapsed_time\n",
    "            else:           \n",
    "                delta_prior_question_elapsed_time[i]=delta_prior_question_elapsed_time_mean    \n",
    "                user_prior_question_elapsed_time_dict['prior_question_elapsed_time'].update({user_id:prior_question_elapsed_time})\n",
    "            i=i+1 \n",
    "\n",
    "\n",
    "        \n",
    "    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)\n",
    "    #test_df = test_df[test_df['content_type_id'] == 0]\n",
    "    #right_index=True\n",
    "    #test_df = pd.merge(test_df, questions_df, on='content_id', how='left',right_index=True)    \n",
    "    #test_df = pd.concat([test_df.reset_index(drop=True), questions_df.reindex(test_df['content_id'].values).reset_index(drop=True)], axis=1)\n",
    "    test_df=test_df.merge(questions_df.loc[questions_df.index.isin(test_df['content_id'])],\n",
    "                  how='left', on='content_id', right_index=True)\n",
    "    \n",
    "    #test_df = pd.merge(test_df, user_lecture_stats_part, on=['user_id'], how=\"left\",right_index=True)\n",
    "    #test_df = pd.concat([test_df.reset_index(drop=True), user_lecture_stats_part.reindex(test_df['user_id'].values).reset_index(drop=True)], axis=1)\n",
    "#     test_df=test_df.merge(user_lecture_stats_part.loc[user_lecture_stats_part.index.isin(test_df['user_id'])],\n",
    "#                   how='left', on='user_id', right_index=True)\n",
    " \n",
    "    test_df['user_lecture_lv'] = user_lecture_sum / user_lecture_count\n",
    "    test_df['user_lecture_sum'] = user_lecture_sum\n",
    "    \n",
    "    test_df['user_interaction_count'] = user_lecture_count\n",
    "    test_df['user_interaction_timestamp_mean'] = test_df['timestamp']/user_lecture_count\n",
    "    \n",
    "    test_df['user_correctness'] = user_sum / user_count\n",
    "    test_df['user_uncorrect_count'] =user_count-user_sum\n",
    "    test_df['user_correct_count'] =user_sum\n",
    "    #test_df['user_answer_count'] =user_count\n",
    "    \n",
    "#     test_df['user_correctness2'] = user_sum2 / user_count2\n",
    "#     test_df['user_uncorrect_count2'] =user_count2-user_sum2\n",
    "#     test_df['user_correct_count2'] =user_sum2\n",
    "    #test_df['user_answer_count2'] =user_count2\n",
    "    \n",
    "    #    \n",
    "    test_df['task_container_correctness'] = task_container_sum / task_container_count\n",
    "    test_df['task_container_cor_count'] = task_container_sum \n",
    "    test_df['task_container_uncor_count'] =task_container_count-task_container_sum \n",
    "    test_df['task_container_std'] = task_container_std \n",
    "    #test_df['content_task_mean'] = content_task_mean \n",
    "    \n",
    "    test_df['explanation_mean'] = explanation_sum / explanation_count\n",
    "    test_df['explanation_true_count'] = explanation_sum\n",
    "    test_df['explanation_false_count'] = explanation_count-explanation_sum \n",
    "    \n",
    "    #\n",
    "    test_df['delta_prior_question_elapsed_time'] = delta_prior_question_elapsed_time \n",
    "    \n",
    "  \n",
    " \n",
    "    test_df[\"attempt_no\"] = test_df[[\"user_id\", \"content_id\"]].apply(lambda row: get_max_attempt(row[\"user_id\"], row[\"content_id\"]), axis=1)\n",
    "    test_df[\"lagtime\"]=lagtime\n",
    "    test_df[\"lagtime2\"]=lagtime2\n",
    "    test_df[\"lagtime3\"]=lagtime3\n",
    "    #test_df[\"lagtime_mean\"]=lagtime_means\n",
    "\n",
    "    \n",
    "\n",
    "    test_df['timestamp']=test_df['timestamp']/(1000*3600)\n",
    "    test_df.timestamp=test_df.timestamp.astype('float16')\n",
    "    test_df['lagtime']=test_df['lagtime']/(1000*3600)\n",
    "    test_df.lagtime=test_df.lagtime.astype('float32')\n",
    "    test_df['lagtime2']=test_df['lagtime2']/(1000*3600)\n",
    "    test_df.lagtime2=test_df.lagtime2.astype('float32')\n",
    "    test_df['lagtime3']=test_df['lagtime3']/(1000*3600)\n",
    "    test_df.lagtime3=test_df.lagtime3.astype('float32')\n",
    "    test_df['user_interaction_timestamp_mean']=test_df['user_interaction_timestamp_mean']/(1000*3600)\n",
    "    test_df.user_interaction_timestamp_mean=test_df.user_interaction_timestamp_mean.astype('float32')\n",
    "    \n",
    "    test_df['user_correctness'].fillna(0.67, inplace=True)\n",
    "    #test_df['user_correctness2'].fillna(0.67, inplace=True)\n",
    "    #\n",
    "    #test_df = test_df.astype(features_dict)\n",
    "\n",
    "    sub_preds = np.zeros(test_df.shape[0])\n",
    "    for i, model in enumerate(clfs, 1):\n",
    "        test_preds  = model.predict(test_df[features])\n",
    "        sub_preds += test_preds\n",
    "    o2=sub_preds / len(clfs)\n",
    "    test_df[target]=0.5*np.array(outs)+0.5*np.array(o2)\n",
    "#     if(flag_lgbm):\n",
    "#         test_df[target] = model.predict(test_df[features])\n",
    "#     else:\n",
    "#         test_df[target] = model.predict(test_df[features].values)\n",
    "    env.predict(test_df[['row_id', target]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
